Here, we are trying to explain and demonstrate how to use the ProAli to align the goldfish genome assembly against the zebrafish reference genome.
Firstly, we prepare the input files, the zebrafish reference genomes in fasta format, the zebrafish reference genome annotation in gff(3) format, and the goldfish query genome file in fasta format.

Download the maize genome sequence and reference GFF file

wget http://ftp.ensembl.org/pub/release-91/fasta/danio_rerio/dna/Danio_rerio.GRCz10.dna.toplevel.fa.gz
wget http://ftp.ensembl.org/pub/release-91/gff3/danio_rerio/Danio_rerio.GRCz10.91.chr.gff3.gz
wget https://research.nhgri.nih.gov/goldfish/download/carAur01.sm.fa
gunzip *gz

Genome masking

Genome masking would not improve the performance of AnchorWave.
AnchorWave do not ultilize any soft masking information. Hard masking would increase the computational cost of AnchorWave.

Check the collinearity and whole-genome duplication for genomes under comparing

Since the ProAli use collinearity blocks and whole genome duplications to guide the genome alignment. Before running the alignment, we use minimap2 to map reference full-length CDS to query genome, visualize the mapping and get ideas about the collinearity and whole-genome duplication.

Overview the genome assemblies

firstly I indexed the genome files using samtools faidx command

samtools faidx carAur01.sm.fa
samtools faidx Danio_rerio.GRCz11.dna.primary_assembly.fa

Then we check the name and sequence length for each sequence record manually.
Then we roughly know they are chromosome level genome assemblies(not contig level).
To check collinearity and whole-genome duplication, we only need to focus on chromosomes.
They are "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" for “Branchiostoma_lanceolatum.BraLan2.dna.toplevel.fa”
and "LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29", "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39", "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" carAur01.sm.fa

since other contigs maight be unclasped heterozygous regions, we only keep chrosomes

head -15923983 carAur01.sm.fa > goldfish.fa



Get full-length CDS in fasta format

NOTE: please do NOT use CDS extracted using other software. Since AnchorWave filtered some CDS records to minimum the side effect of minimap2 limitation that “Minimap2 often misses small exons” (https://github.com/lh3/minimap2#limitations)

anchorwave gff2seq -r Danio_rerio.GRCz11.dna.primary_assembly.fa -i Danio_rerio.GRCz11.102.chr.gff3 -o cds.fa

Use minimap2 to map the extracted full-length CDS to the reference and query genomes

minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Danio_rerio.GRCz11.dna.primary_assembly.fa cds.fa > ref.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish.fa cds.fa > carAur.sam

Visualize the full-length CDS mapping result

perl alignmentToDotplot.pl Danio_rerio.GRCz11.102.chr.gff3 carAur.sam > carAur.tab
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")


changetoM <- function ( position ){
  position=position/1000000;
  paste(position, "M", sep="")
}


data =read.table("carAur.tab")
data = data[which(data$V1 %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" )),]
data = data[which(data$V3 %in% c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29",  "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39",  "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50")),]
data$V1 = factor(data$V1, levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" ))
data$V3 = factor(data$V3, levels=c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29",  "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39",  "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" ))
p = ggplot(data=data, aes(x=V4, y=V2))+geom_point(size=1, aes(color=V5))+facet_grid(V1~V3, scales="free", space="free" )+ theme_grey(base_size = 60) +
    labs(x="goldfish", y="zebrafish")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
    theme(axis.line = element_blank(),
          panel.background = element_blank(),
          panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
          axis.text.y = element_text( colour = "black"),
          legend.position='none',
          axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="carAur.png",width = 12000, height = 7000)
p

dev.off()
## png 
##   2

Using ProAli to identify collinear region and plot collinear anchors

Considering the genome duplication and translocation variations, we use the proali function to identify collinear region.
We expect the genome alignment coverage on reference genome (zebrafish) equal with or smaller than 2. We expect the genome alignment coverage on query genome (goldsifh) equal with or smaller than 1. So we set parameter -R 2 -Q 1 As this stage we would like to identify collinear anchors and do not perform base-pair resolution sequence alignment. So we only set -n to output collinear anchors. By setting -ns , we do not would like to identify novel anchors.

anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur.sam -as cds.fa -ar ref.sam -s goldfish.fa -n align1.anchors -R 2 -Q 1 -ns
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")


changetoM <- function ( position ){
  position=position/1000000;
  paste(position, "M", sep="")
}

data =read.table("align1.anchors", head=TRUE)
data = data[which(data$refChr %in% c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" )),]
data = data[which(data$queryChr %in% c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29",  "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39",  "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50")),]
data$refChr = factor(data$refChr, levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "MT" ))
data$queryChr = factor(data$queryChr, levels=c("LG1","LG2","LG3", "LG4", "LG5", "LG6", "LG7", "LG8", "LG9", "LG10", "LG11", "LG12", "LG13", "LG14", "LG15", "LG16", "LG17", "LG18", "LG19", "LG20", "LG21", "LG22", "LG23", "LG24", "LG25", "LG26", "LG27", "LG28", "LG28B", "LG29",  "LG30", "LG30F", "LG31", "LG32", "LG33", "LG34", "LG35", "LG36", "LG36F", "LG37", "LG37M", "LG38", "LG39",  "LG40", "LG41", "LG42", "LG42F", "LG43", "LG44", "LG44F", "LG45", "LG45M", "LG46", "LG47", "LG48", "LG48F", "LG49", "LG49B", "LG50" ))
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=1, aes(color=strand))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 60) +
    labs(x="goldfish", y="zebrafish")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
    theme(axis.line = element_blank(),
          panel.background = element_blank(),
          panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
          axis.text.y = element_text( colour = "black"),
          legend.position='none',
          axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="align1.anchors.png",width = 12000, height = 7000)
p

dev.off()
## png 
##   2

Perform genome alignment

Here we set scoring parameters -B -O1 -E1 -O2, to compare the alignment result of using different scoring parameters.
Using the wavefront alignment (WFA) algorithm, the match score is constantly set as 0. Empirically, we always set -E2 as -1.
We set -w, and -fa3 to minimum the usage of sliding window alignment approach and novel anchors. Those parameter would improve the alignment while cost memory than the default parameters.
With the following setting, each thread could cost as high as 50 Gb memory, and we ran this command on a computer with 512Gb memory available with 10 threads.

/usr/bin/time ./anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa \
  -a carAur.sam -as cds.fa -ar ref.sam -s goldfish.fa -n align2.anchors -o align2.maf -t 7 -R 2 -Q 1 -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 \
  -f align2.f.maf -w 38000 -fa3 200000 > fishlog2 2>&1

Supplemental Table 5. List of 436,035 representative ATAC-seq peak.txt is from https://www.nature.com/articles/s41586-020-2962-9

and rename it to cre.bed

maf-convert sam align2.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > align.bam
samtools depth align.bam | wc -l #1145404904
samtools depth align.bam | awk '$3>0{print $0}' | wc -l # 313676584

sed -i -E 's/^chr//g' cre.bed
samtools depth -b cre.bed align.bam | wc -l #191251221
samtools depth -b cre.bed align.bam | awk '$3>0{print $0}' | wc -l #53884567
minimap2 -x asm5 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_5.sam
minimap2 -x asm10 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_10.sam
minimap2 -x asm20 -t 20 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa > minimap2_goldfish_20.sam

samtools sort minimap2_goldfish_5.sam > minimap2_goldfish_5.bam
samtools sort minimap2_goldfish_10.sam > minimap2_goldfish_10.bam
samtools sort minimap2_goldfish_20.sam > minimap2_goldfish_20.bam

samtools depth minimap2_goldfish_5.bam | wc -l   #1350589
samtools depth minimap2_goldfish_10.bam | wc -l   #6407183
samtools depth minimap2_goldfish_20.bam | wc -l   #35881644

samtools depth minimap2_goldfish_5.bam | awk '$3>0{print $0}' | wc -l # 1343192
samtools depth minimap2_goldfish_10.bam | awk '$3>0{print $0}' | wc -l # 6329847
samtools depth minimap2_goldfish_20.bam | awk '$3>0{print $0}' | wc -l # 35002774

lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish.fa > goldfish_lastal.maf
faSize -detailed goldfish.fa > goldfish.size
faToTwoBit goldfish.fa goldfish.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish_lastal.maf > goldfish_lastal.psl

axtChain -linearGap=loose -psl goldfish_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa goldfish_lastal.chain
chainMergeSort goldfish_lastal.chain > goldfish_lastal.all.chain
chainPreNet goldfish_lastal.all.chain zebrafish.size goldfish.size goldfish_lastal.preNet
chainNet goldfish_lastal.preNet zebrafish.size goldfish.size goldfish_lastal.refTarget.net goldfish_lastal.chainNet
netToAxt goldfish_lastal.refTarget.net goldfish_lastal.preNet zebrafish.2bit goldfish.2bit stdout | axtSort stdin goldfish_lastal.axt
axtToMaf goldfish_lastal.axt zebrafish.size goldfish.size goldfish_lastal_final.maf -qPrefix=query. -tPrefix=col.

perl lastFinalToSplit.pl goldfish_lastal_final.maf > goldfish_lastal_final_forsplit.maf
cat goldfish_lastal_final_forsplit.maf | last-split > goldfish_lastal_final_split.maf

maf-convert sam goldfish_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal.bam  #many-to-many
maf-convert sam goldfish_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal_final.bam # many to one
maf-convert sam goldfish_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_lastal_final_split.bam



samtools index goldfish_lastal.bam
samtools index goldfish_lastal_final.bam
samtools index goldfish_lastal_split.bam
samtools index goldfish_lastal_final_split.bam

samtools depth goldfish_lastal.bam | wc -l   #628324746
samtools depth goldfish_lastal_split.bam | wc -l   #210589780
samtools depth goldfish_lastal_final.bam | wc -l   #556518096
samtools depth goldfish_lastal_final_split.bam | wc -l   #252695645


samtools depth goldfish_lastal.bam | awk '$3>0{print $0}' | wc -l # 624893762
samtools depth goldfish_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 207926386
samtools depth goldfish_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 547064597
samtools depth goldfish_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 248353893

perform genome alignment using MUMmer4 (https://mummer4.github.io/) and summarize the result

nucmer -t 75 --sam-long=mumer.goldfish.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish.fa
cat mumer.goldfish.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish.short.bam
samtools index mumer.goldfish.short.bam
samtools depth mumer.goldfish.short.bam | wc -l   # 94867701
samtools depth mumer.goldfish.short.bam | awk '$3>0{print $0}' | wc -l   # 93794614

perform genome alignment using GSAlign(https://github.com/hsinnan75/GSAlign) and summarize the result

GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish.fa -t 18 -o goldfish_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish_gsalign.bam

samtools index goldfish_gsalign.bam
samtools depth goldfish_gsalign.bam | wc -l # 20059298
samtools depth goldfish_gsalign.bam | awk '$3>0{print $0}' | wc -l # 19574261
genomesize = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
genomesize = sum(genomesize$V2)


library(ggplot2)
dat = read.table("goldFishSummary", sep="\t", header=TRUE)
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
dat$approach <- factor(dat$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))

dat$proportion = dat$proportion/genomesize
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity")  +  scale_fill_manual(values=c("#54AEE1", "#92A000", "#EF8600")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
  labs(x="", y="Genome proportion", title="")+
            theme_bw() +theme_grey(base_size = 24) + theme(
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        legend.title = element_blank(),
              axis.line = element_blank(),
        legend.position = "top",
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
        
            panel.background = element_blank(),
            axis.text.y = element_text( colour = "black"),
            axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))

print(p)

file = "genome_aligned"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png 
##   2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png 
##   2
d = read.table("goldfish.fa.fai")
sum(d$V2)/1000/1000/1000 # 1.273912
## [1] 1.273912
d = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
d = head(d, 25)
sum(d$V2)/1000/1000/1000 # 1.345102
## [1] 1.345102

Then we seperated two subgenomes

seqkit grep -n -p LG1 goldfish.fa > goldfish1.fa
seqkit grep -n -p LG2 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG3 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG4 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG5 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG6 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG7 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG8 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG9 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG10 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG11 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG12 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG13 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG14 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG15 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG16 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG17 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG18 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG19 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG20 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG21 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG22 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG23 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG24 goldfish.fa >> goldfish1.fa
seqkit grep -n -p LG25 goldfish.fa >> goldfish1.fa

seqkit grep -n -p LG26 goldfish.fa > goldfish2.fa
seqkit grep -n -p LG27 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG28 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG28B goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG29 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG30 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG30F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG31 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG32 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG33 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG34 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG35 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG36 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG36F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG37 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG37M goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG38 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG39 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG40 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG41 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG42 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG42F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG43 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG44 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG44F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG45 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG45M goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG46 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG47 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG48 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG48F goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG49 goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG49B goldfish.fa >> goldfish2.fa
seqkit grep -n -p LG50 goldfish.fa >> goldfish2.fa
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish1.fa cds.fa > carAur1.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 goldfish2.fa cds.fa > carAur2.sam


/usr/bin/time -v anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur1.sam -as cds.fa -ar ref.sam -s goldfish1.fa -n alignh1.anchors -o alignh1.maf -R 1 -Q 1 -f alignh1.f.maf -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 -w 38000 -fa3 200000 > fishlogh1 2>&1
/usr/bin/time -v anchorwave proali -i Danio_rerio.GRCz11.102.chr.gff3 -r Danio_rerio.GRCz11.dna.primary_assembly.fa -a carAur2.sam -as cds.fa -ar ref.sam -s goldfish2.fa -n alignh2.anchors -o alignh2.maf -R 1 -Q 1 -f alignh2.f.maf -w 120000 -fa 80000 -fa2 120000 -fa3 80000 > fishlogh2 2>&1
maf-convert sam alignh1.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish1.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > alignh1.bam
samtools depth alignh1.bam | wc -l #1103736001
samtools depth alignh1.bam | awk '$3>0{print $0}' | wc -l # 427549895


maf-convert sam alignh2.maf | sed 's/Danio_rerio.GRCz11.dna.primary_assembly.fa.//g' | sed 's/goldfish2.fa.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > alignh2.bam
samtools depth alignh2.bam | wc -l #949917011
samtools depth alignh2.bam | awk '$3>0{print $0}' | wc -l # 352417508

samtools merge alignh_seperate.sam alignh1.bam alignh2.bam
samtools depth alignh_seperate.sam | wc -l #1256261085
samtools depth alignh_seperate.sam | awk '$3>0{print $0}' | wc -l # 629358172


minimap2 -x asm5 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_5.sam
minimap2 -x asm10 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_10.sam
minimap2 -x asm20 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa > minimap2_goldfish1_20.sam

minimap2 -x asm5 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_5.sam
minimap2 -x asm10 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_10.sam
minimap2 -x asm20 -t 78 -a Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa > minimap2_goldfish2_20.sam




samtools sort minimap2_goldfish1_5.sam > minimap2_goldfish1_5.bam
samtools sort minimap2_goldfish1_10.sam > minimap2_goldfish1_10.bam
samtools sort minimap2_goldfish1_20.sam > minimap2_goldfish1_20.bam

samtools depth minimap2_goldfish1_5.bam | wc -l   # 1091486
samtools depth minimap2_goldfish1_10.bam | wc -l   # 5157696
samtools depth minimap2_goldfish1_20.bam | wc -l   # 29328585

samtools depth minimap2_goldfish1_5.bam | awk '$3>0{print $0}' | wc -l # 1085063
samtools depth minimap2_goldfish1_10.bam | awk '$3>0{print $0}' | wc -l # 5090084
samtools depth minimap2_goldfish1_20.bam | awk '$3>0{print $0}' | wc -l # 28551676



samtools sort minimap2_goldfish2_5.sam > minimap2_goldfish2_5.bam
samtools sort minimap2_goldfish2_10.sam > minimap2_goldfish2_10.bam
samtools sort minimap2_goldfish2_20.sam > minimap2_goldfish2_20.bam

samtools depth minimap2_goldfish2_5.bam | wc -l   # 571183
samtools depth minimap2_goldfish2_10.bam | wc -l   # 3344970
samtools depth minimap2_goldfish2_20.bam | wc -l   # 21840870

samtools depth minimap2_goldfish2_5.bam | awk '$3>0{print $0}' | wc -l #  567843
samtools depth minimap2_goldfish2_10.bam | awk '$3>0{print $0}' | wc -l # 3297946
samtools depth minimap2_goldfish2_20.bam | awk '$3>0{print $0}' | wc -l #  21215193



samtools merge minimap2_goldfish_5_seperate.bam minimap2_goldfish1_5.sam minimap2_goldfish2_5.sam
samtools merge minimap2_goldfish_10_seperate.bam minimap2_goldfish1_10.sam minimap2_goldfish2_10.sam
samtools merge minimap2_goldfish_20_seperate.bam minimap2_goldfish1_20.sam minimap2_goldfish2_20.sam

samtools sort minimap2_goldfish_5_seperate.bam  > minimap2_goldfish_5_seperate_sort.bam
samtools sort minimap2_goldfish_10_seperate.bam  > minimap2_goldfish_10_seperate_sort.bam
samtools sort minimap2_goldfish_20_seperate.bam  > minimap2_goldfish_20_seperate_sort.bam


samtools depth minimap2_goldfish_5_seperate_sort.bam | wc -l   # 1350589
samtools depth minimap2_goldfish_10_seperate_sort.bam | wc -l   # 6407183
samtools depth minimap2_goldfish_20_seperate_sort.bam | wc -l   # 35881644

samtools depth minimap2_goldfish_5_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  1343192
samtools depth minimap2_goldfish_10_seperate_sort.bam | awk '$3>0{print $0}' | wc -l # 6329847
samtools depth minimap2_goldfish_20_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  35002774


lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish1.fa > goldfish1_lastal.maf
faSize -detailed goldfish1.fa > goldfish1.size
faToTwoBit goldfish1.fa goldfish1.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish1_lastal.maf > goldfish1_lastal.psl

axtChain -linearGap=loose -psl goldfish1_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa goldfish1_lastal.chain
chainMergeSort goldfish1_lastal.chain > goldfish1_lastal.all.chain
chainPreNet goldfish1_lastal.all.chain zebrafish.size goldfish1.size goldfish1_lastal.preNet
chainNet goldfish1_lastal.preNet zebrafish.size goldfish1.size goldfish1_lastal.refTarget.net goldfish1_lastal.chainNet
netToAxt goldfish1_lastal.refTarget.net goldfish1_lastal.preNet zebrafish.2bit goldfish1.2bit stdout | axtSort stdin goldfish1_lastal.axt
axtToMaf goldfish1_lastal.axt zebrafish.size goldfish1.size goldfish1_lastal_final.maf -qPrefix=query. -tPrefix=col.
perl lastFinalToSplit.pl goldfish1_lastal_final.maf > goldfish1_lastal_final_forsplit.maf
cat goldfish1_lastal.maf | last-split > goldfish1_lastal_split.maf
cat goldfish1_lastal_final_forsplit.maf | last-split > goldfish1_lastal_final_split.maf

python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_final.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_split.maf | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_split.bam
python2 /programs/last-932/bin/maf-convert sam goldfish1_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_lastal_final_split.bam

samtools index goldfish1_lastal.bam
samtools index goldfish1_lastal_final.bam
samtools index goldfish1_lastal_split.bam
samtools index goldfish1_lastal_final_split.bam

samtools depth goldfish1_lastal.bam | wc -l   # 556566186
samtools depth goldfish1_lastal_split.bam | wc -l # 175401405
samtools depth goldfish1_lastal_final.bam | wc -l   # 506859989
samtools depth goldfish1_lastal_final_split.bam | wc -l   # 207853751

samtools depth goldfish1_lastal.bam | awk '$3>0{print $0}' | wc -l #  552251259
samtools depth goldfish1_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 171964354
samtools depth goldfish1_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 498521800
samtools depth goldfish1_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 204194727




lastdb -P0 zebrafish Danio_rerio.GRCz11.dna.primary_assembly.fa
faToTwoBit Danio_rerio.GRCz11.dna.primary_assembly.fa zebrafish.2bit
faSize -detailed Danio_rerio.GRCz11.dna.primary_assembly.fa > zebrafish.size
lastal zebrafish goldfish2.fa > goldfish2_lastal.maf
faSize -detailed goldfish2.fa > goldfish2.size
faToTwoBit goldfish2.fa goldfish2.2bit
python2 /programs/last-932/bin/maf-convert psl goldfish2_lastal.maf > goldfish2_lastal.psl

axtChain -linearGap=loose -psl goldfish2_lastal.psl -faQ -faT Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa goldfish2_lastal.chain
chainMergeSort goldfish2_lastal.chain > goldfish2_lastal.all.chain
chainPreNet goldfish2_lastal.all.chain zebrafish.size goldfish2.size goldfish2_lastal.preNet
chainNet goldfish2_lastal.preNet zebrafish.size goldfish2.size goldfish2_lastal.refTarget.net goldfish2_lastal.chainNet
netToAxt goldfish2_lastal.refTarget.net goldfish2_lastal.preNet zebrafish.2bit goldfish2.2bit stdout | axtSort stdin goldfish2_lastal.axt
axtToMaf goldfish2_lastal.axt zebrafish.size goldfish2.size goldfish2_lastal_final.maf -qPrefix=query. -tPrefix=col.
perl lastFinalToSplit.pl goldfish2_lastal_final.maf > goldfish2_lastal_final_forsplit.maf
cat goldfish2_lastal.maf | last-split > goldfish2_lastal_split.maf
cat goldfish2_lastal_final_forsplit.maf | last-split > goldfish2_lastal_final_split.maf

python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal.maf| samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_final.maf| sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_final.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_split.maf | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_split.bam
python2 /programs/last-932/bin/maf-convert sam goldfish2_lastal_final_split.maf | sed 's/col.//g' | sed 's/query.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_lastal_final_split.bam

samtools index goldfish2_lastal.bam
samtools index goldfish2_lastal_final.bam
samtools index goldfish2_lastal_split.bam
samtools index goldfish2_lastal_final_split.bam

samtools depth goldfish2_lastal.bam | wc -l   # 512932320
samtools depth goldfish2_lastal_split.bam | wc -l # 145622396
samtools depth goldfish2_lastal_final.bam | wc -l   # 467063301
samtools depth goldfish2_lastal_final_split.bam | wc -l   # 173644026

samtools depth goldfish2_lastal.bam | awk '$3>0{print $0}' | wc -l #  509004222
samtools depth goldfish2_lastal_split.bam | awk '$3>0{print $0}' | wc -l # 142626377
samtools depth goldfish2_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 459369555
samtools depth goldfish2_lastal_final_split.bam | awk '$3>0{print $0}'| wc -l # 170487273



samtools merge goldfish_lastal_seperate.bam goldfish1_lastal.bam goldfish2_lastal.bam
samtools sort goldfish_lastal_seperate.bam  > goldfish_lastal_seperate_sort.bam
samtools depth goldfish_lastal_seperate_sort.bam | wc -l   # 628324746
samtools depth goldfish_lastal_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  624893762

samtools merge goldfish_lastal_split_seperate.bam goldfish1_lastal_split.bam goldfish2_lastal_split.bam
samtools sort goldfish_lastal_split_seperate.bam  > goldfish_lastal_split_seperate_sort.bam
samtools depth goldfish_lastal_split_seperate_sort.bam | wc -l   # 210589780
samtools depth goldfish_lastal_split_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  207926386

samtools merge goldfish_lastal_final_seperate.bam goldfish1_lastal_final.bam goldfish2_lastal_final.bam
samtools sort goldfish_lastal_final_seperate.bam  > goldfish_lastal_final_seperate_sort.bam
<!-- samtools depth goldfish_lastal_final_seperate_sort.bam | wc -l   # 592717560 -->
samtools depth goldfish_lastal_final_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  587452812

samtools merge goldfish_lastal_final_split_seperate.bam goldfish1_lastal_final_split.bam goldfish2_lastal_final_split.bam
samtools sort goldfish_lastal_final_split_seperate.bam  > goldfish_lastal_final_split_seperate_sort.bam
samtools depth goldfish_lastal_final_split_seperate_sort.bam | wc -l   # 264115166
samtools depth goldfish_lastal_final_split_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  261043159

perform genome alignment using MUMmer4 (https://mummer4.github.io/) and summarize the result

nucmer -t 75 --sam-long=mumer.goldfish1.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish1.fa
cat mumer.goldfish1.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish1.short.bam
samtools index mumer.goldfish1.short.bam # 
samtools depth mumer.goldfish1.short.bam | wc -l   # 76143143
samtools depth mumer.goldfish1.short.bam | awk '$3>0{print $0}' | wc -l   # 75031298



nucmer -t 75 --sam-long=mumer.goldfish2.short.sam Danio_rerio.GRCz11.dna.primary_assembly.fa goldfish2.fa
cat mumer.goldfish2.short.sam | grep -v "@" | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > mumer.goldfish2.short.bam
samtools index mumer.goldfish2.short.bam #
samtools depth mumer.goldfish2.short.bam | wc -l   # 60735316
samtools depth mumer.goldfish2.short.bam | awk '$3>0{print $0}' | wc -l # 59835463


samtools merge mumer.goldfish.short_seperate.bam mumer.goldfish1.short.bam mumer.goldfish2.short.bam
samtools sort mumer.goldfish.short_seperate.bam  > mumer.goldfish.short_seperate_sort.bam
samtools depth mumer.goldfish.short_seperate_sort.bam | wc -l   # 94867701
samtools depth mumer.goldfish.short_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  93794614

perform genome alignment using GSAlign(https://github.com/hsinnan75/GSAlign) and summarize the result

GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish1.fa -t 18 -o goldfish1_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish1_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish1_gsalign.bam
samtools index goldfish1_gsalign.bam
samtools depth goldfish1_gsalign.bam | wc -l # 13165258
samtools depth goldfish1_gsalign.bam | awk '$3>0{print $0}' | wc -l # 12830191


GSAlign -r Danio_rerio.GRCz11.dna.primary_assembly.fa -q goldfish2.fa -t 70 -o goldfish2_gsalign -fmt 1
python2 /programs/last-932/bin/maf-convert sam goldfish2_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Danio_rerio.GRCz11.dna.primary_assembly.fa - | samtools sort - > goldfish2_gsalign.bam
samtools index goldfish2_gsalign.bam
samtools depth goldfish2_gsalign.bam | wc -l #  9909623
samtools depth goldfish2_gsalign.bam | awk '$3>0{print $0}' | wc -l # 9638825


samtools merge goldfish2_gsalign_seperate.bam goldfish1_gsalign.bam goldfish2_gsalign.bam
samtools sort goldfish2_gsalign_seperate.bam  >goldfish2_gsalign_seperate_sort.bam
samtools depth goldfish2_gsalign_seperate_sort.bam | wc -l   # 20038773
samtools depth goldfish2_gsalign_seperate_sort.bam | awk '$3>0{print $0}' | wc -l #  19553866



genomesize = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
genomesize = sum(genomesize$V2)


library(ggplot2)
dat = read.table("goldFishSummary2", sep="\t", header=TRUE)
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
dat$approach <- factor(dat$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))

dat$proportion = dat$proportion/genomesize
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity")  + scale_fill_manual(values=c("#55B0E4", "#FFF800", "#FFBD00")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
  labs(x="", y="Genome porportion", title="")+
            theme_bw() +theme_grey(base_size = 24) + theme(
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        legend.title = element_blank(),
              axis.line = element_blank(),
        legend.position = "top",
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
        
            panel.background = element_blank(),
            axis.text.y = element_text( colour = "black"),
            axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))

print(p)

file = "genome_aligned2"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png 
##   2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png 
##   2
d = read.table("goldfish.fa.fai")
sum(d$V2)/1000/1000/1000 # 1.273912
## [1] 1.273912
d = read.table("Danio_rerio.GRCz11.dna.primary_assembly.fa.fai")
d = head(d, 25)
sum(d$V2)/1000/1000/1000 # 1.345102
## [1] 1.345102